home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- */
-
-
-
- /* Inverted file accessor functions.
-
- Main functions:
- finished_add_word
-
- */
-
-
- #include <string.h> /* for memset() */
-
- #include "cutil.h"
- #include "futil.h"
- #include "irhash.h"
- #include "panic.h"
- #include "irfiles.h"
- #include "irext.h"
-
- /* ================== */
- /* === Index file === */
- /* ================== */
-
- /*This is an implementation of the inverted file itself.
- * An index_block_number is the position in the file that the
- * entry starts.
- * Index block 0 is the null pointer.
- * The header contains the number of different words in the file.
- *
- */
-
- /* the format of an index block is:
- * in the first byte:
- * INDEX_BLOCK_FULL_FLAG
- * || INDEX_BLOCK_NOT_FULL_FLAG
- * || INDEX_BLOCK_DICTIONARY_FLAG
- * if index_block_full_flag
- * next_index_block
- * index_block_size
- * stuff
- * (the number of valid entries is index_block_size - index_block_header_size)
- * if index_block_not_full_flag
- * number_of_valid_entries
- * index_block_size
- * stuff
- * if index_block_dictionary_flag
- * last_index_block NEXT_INDEX_BLOCK_SIZE (0 when this is the last)
- * index_block_size INDEX_BLOCK_SIZE_SIZE
- * number_of_occurances NUMBER_OF_OCCURANCES_SIZE
- * word (followed by \n)
- */
-
- long
- write_dictionary_index_block(number_of_occurances,word,stream)
- long number_of_occurances;
- char *word;
- FILE *stream;
- /* returns a pointer to the index block allocated */
- {
- /* this assumes that the stream is positioned at the end */
- long before_length = ftell(stream); /* file_length(stream); */
-
- long index_block_size =
- INDEX_BLOCK_FLAG_SIZE +
- NEXT_INDEX_BLOCK_SIZE +
- INDEX_BLOCK_SIZE_SIZE +
- NUMBER_OF_OCCURANCES_SIZE +
- strlen(word) +
- strlen("\n"); /* this is done this way for PC's (necessary?) */
- /* printf("writing dict entry to position %ld\n", before_length); */
- /* grow the file */
- /* in this implementation, the file is always filled by the fwrite,
- * so it will grow itself.
- grow_file(stream, before_length + how_large);
- * file length leaves the stream at the end, so we are all set.
- s_fseek(stream, before_length, SEEK_SET);
- */
- write_bytes(INDEX_BLOCK_DICTIONARY_FLAG, INDEX_BLOCK_FLAG_SIZE,
- stream);
- write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, stream);
- write_bytes(index_block_size, INDEX_BLOCK_SIZE_SIZE, stream);
-
- write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE,
- stream);
- fprintf(stream, "%s\n", word);
- /* just to check. this will be unnecessary */
- { long after_length = ftell(stream);
- if(after_length - before_length != index_block_size){
- waislog(WLOG_HIGH, WLOG_ERROR,
- "dictionary entry size is %ld, and we thought %ld",
- after_length - before_length, index_block_size);
- }
- }
- return(before_length);
- }
-
#ifdef everneeded
static long modify_dictionary_index_block
  _AP((long index_block,long last_index_block,long number_of_occurances,
       FILE* stream));

/* Rewrites the last_index_block and number_of_occurances fields of the
 * dictionary block that starts at offset index_block.
 *
 * The word itself cannot be changed, only the fields around it.
 * Panics if index_block does not start with INDEX_BLOCK_DICTIONARY_FLAG.
 * Leaves the stream positioned at the end of the file and returns 0.
 *
 * The field offsets follow the layout produced by
 * write_dictionary_index_block: flag, next/last block, block size,
 * number of occurances.
 * NOTE(review): the stream must be open for both reading and writing;
 * this file does not show such a caller - confirm before enabling.
 */
static long modify_dictionary_index_block(index_block,last_index_block,number_of_occurances,stream)
long index_block;
long last_index_block;
long number_of_occurances;
FILE* stream;
{
  long last_block_offset = index_block + INDEX_BLOCK_FLAG_SIZE;
  long occurances_offset = last_block_offset +
                           NEXT_INDEX_BLOCK_SIZE +
                           INDEX_BLOCK_SIZE_SIZE;

  s_fseek(stream, index_block, SEEK_SET);
  if(INDEX_BLOCK_DICTIONARY_FLAG != read_bytes(INDEX_BLOCK_FLAG_SIZE, stream))
    panic("the index block %ld is not a legal dictionary block",
          index_block);

  /* stdio requires a positioning call between a read and a following
     write on the same stream, so seek explicitly to each field */
  s_fseek(stream, last_block_offset, SEEK_SET);
  write_bytes(last_index_block, NEXT_INDEX_BLOCK_SIZE, stream);

  /* skip the (unchanged) block size field */
  s_fseek(stream, occurances_offset, SEEK_SET);
  write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE,
              stream);

  s_fseek(stream, 0L, SEEK_END);
  return(0);
}

#endif /* def everneeded */
-
- static long next_dictionary_index_block
- _AP((long* index_block_size,long* number_of_occurances,char* word,
- FILE* stream));
-
- static long
- next_dictionary_index_block(index_block_size,number_of_occurances,word,stream)
- long *index_block_size;
- long *number_of_occurances;
- char *word;
- FILE *stream;
- {
- /* this read the dictionary index block from the index stream.
- It assumes the stream is positioned at the start of a dictionary
- block, and will return non-0 if it is not.
- returns 0 if it succeeds. */
- long index_block_flag;
- char temp[MAX_WORD_LENGTH + 2];
-
- word[0] = '\0';
-
- index_block_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
- if(index_block_flag == EOF)
- return(-1);
- if(index_block_flag != INDEX_BLOCK_DICTIONARY_FLAG)
- panic("Did not find a dictionary entry, flag is %ld", index_block_flag);
- (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
- *index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
- *number_of_occurances = read_bytes(NUMBER_OF_OCCURANCES_SIZE,
- stream);
- fgets(temp, MAX_WORD_LENGTH + 2, stream); /* 2 is for the \n and '\0' */
-
- /* trim the \n */
- if(temp[strlen(temp) - 1] == '\n'){
- temp[strlen(temp) - 1] = '\0';
- }
- strcpy(word, temp);
-
- return(0);
- }
-
- #define testing
-
- #ifdef testing
-
/* Seeks to index_block and reads the dictionary block found there.
 *
 * Fills in *index_block_size, *number_of_occurances and word through
 * next_dictionary_index_block, and panics if that call reports a failure.
 * *last_index_block is always set to 0; the value stored in the file is
 * not returned.
 * Returns the status from next_dictionary_index_block (0 on success).
 */
long
read_dictionary_index_block(index_block,
                            last_index_block,
                            index_block_size,
                            number_of_occurances,
                            word,
                            stream)
long index_block;
long *last_index_block;
long *index_block_size;
long *number_of_occurances;
char *word;
FILE *stream;
{
  long status;

  s_fseek(stream, index_block, SEEK_SET);
  status = next_dictionary_index_block(index_block_size,
                                       number_of_occurances,
                                       word,
                                       stream);
  if(status < 0)
    panic("read dictionary block %ld failed", index_block);

  *last_index_block = 0;
  return(status);
}
-
- #endif /* def testing */
-
- long allocate_index_block(how_large,stream)
- long how_large;
- FILE* stream;
- {
- /* This returns a pointer for an index block.
- It creates the space and writes the header.
- how_large includes the header.
- Returns the block_number (the byte address of first byte in the block */
- /* this assumes that the stream is positioned at the end */
- long before_length = ftell(stream); /* file_length(stream); */
- /* grow the file */
- /* in this implementation, the file is always filled by the fwrite,
- * so it will grow itself.
- grow_file(stream, before_length + how_large);
- * file length leaves the stream at the end, so we are all set.
- s_fseek(stream, before_length, SEEK_SET);
- */
- write_bytes(INDEX_BLOCK_FULL_FLAG, /* in this version all are full */
- INDEX_BLOCK_FLAG_SIZE,
- stream);
- write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, stream);
- write_bytes(how_large, INDEX_BLOCK_SIZE_SIZE, stream);
- return(before_length);
- }
-
- #ifdef testing
-
- static void scan_index_blocks _AP((char* filename,boolean verbose));
-
- static void scan_index_blocks(filename,verbose)
- char *filename;
- boolean verbose;
- {
- /* this is a debugging routine for checking the inverted index file */
- long current_index_block = INDEX_HEADER_SIZE;
- FILE *stream = s_fopen(filename, "rb");
- long length = file_length(stream);
-
- s_fseek(stream, current_index_block, SEEK_SET);
- printf("File length %ld\n", length);
-
- while(current_index_block < length){
- long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
- long next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
- long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
- if(flag == INDEX_BLOCK_DICTIONARY_FLAG){
- long last_index_block;
- long index_block_size;
- long number_of_occurances;
- char word[MAX_WORD_LENGTH + 1];
- if(0 > read_dictionary_index_block(current_index_block,
- &last_index_block,
- &index_block_size,
- &number_of_occurances,
- word,
- stream))
- panic("read dictionary index block failed");
- if(verbose)
- printf("%5ld: size %3ld Dictionary entry '%s', number of occurances %ld last block %ld\n",
- current_index_block, index_block_size, word,
- number_of_occurances, next_index_block);
- }
- else if(flag == INDEX_BLOCK_NOT_FULL_FLAG){
- if(verbose)
- printf("%5ld: size %3ld Not full, valid entries %ld\n",
- current_index_block, index_block_size, next_index_block);
- }
- else if(flag == INDEX_BLOCK_FULL_FLAG){
- if(verbose)
- printf("%5ld: size %3ld full block, next block %ld\n",
- current_index_block, index_block_size, next_index_block);
- }
- else
- panic("bad entry %ld (ftell %ld), flag was %ld",
- current_index_block,
- ftell(stream), flag);
- current_index_block += index_block_size;
- s_fseek(stream, current_index_block, SEEK_SET);
- }
- s_fclose(stream);
- }
-
- #endif /* def testing */
-
- #define COPY_BLOCK_BUFFER_LENGTH 1000
-
- static long copy_stream _AP((FILE* from_stream,FILE* to_stream,long n));
-
- static long copy_stream(from_stream,to_stream,n)
- FILE *from_stream;
- FILE *to_stream;
- long n;
- {
- char buffer[COPY_BLOCK_BUFFER_LENGTH];
- while(n > 0){
- /* keep reading until we are done */
- long amount_read = fread(buffer, sizeof(char),
- MIN(n, COPY_BLOCK_BUFFER_LENGTH),
- from_stream);
- if(amount_read == 0 || amount_read == EOF)
- return(-1);
- if(amount_read != fwrite(buffer, sizeof(char), amount_read, to_stream))
- return(-1);
- n -= amount_read;
- }
- return(0);
- }
-
-
- static void merge_blocks _AP((char* word,FILE* from_stream_1,
- FILE* from_stream_2,FILE* to_stream));
-
- static void merge_blocks(word,from_stream_1,from_stream_2,to_stream)
- char* word;
- FILE *from_stream_1;
- FILE *from_stream_2;
- FILE *to_stream;
- /* puts from_stream_1 first into to_stream then from_stream_2*/
- {
- long flag;
- long index_block_size;
- long other_block_size;
-
- flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_1);
- (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_1);
- index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_1);
-
- if(flag == EOF) return;
- if(flag != INDEX_BLOCK_FULL_FLAG)
- panic("the next index block is not a full block");
-
- flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_2);
- (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_2);
- other_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_2);
- if(flag != INDEX_BLOCK_FULL_FLAG)
- panic("the next index block is not a full block");
-
- write_bytes(flag, INDEX_BLOCK_FLAG_SIZE, to_stream);
- write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, to_stream);
- if((index_block_size + other_block_size) >
- (1L << (INDEX_BLOCK_SIZE_SIZE * 8))){
- /* then the block is too large to be represented in the
- index_block_size_size. This routine takes the radical step to
- eliminate it completely. The "right" thing to do is to
- chain blocks, but I dont think it is worth it since this should not
- happen very often. If it does happen often then bump up
- INDEX_BLOCK_SIZE_SIZE. */
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Can not merge the index block for %s, since it would create one that is too big. Deleting it.",word);
- write_bytes(INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_SIZE_SIZE, to_stream);
- s_fseek(from_stream_1, index_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR);
- s_fseek(from_stream_2, other_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR);
- }
- else{ /* copy away */
- write_bytes(index_block_size + other_block_size - INDEX_BLOCK_HEADER_SIZE,
- INDEX_BLOCK_SIZE_SIZE, to_stream);
- copy_stream(from_stream_1, to_stream, index_block_size - INDEX_BLOCK_HEADER_SIZE);
- copy_stream(from_stream_2, to_stream, other_block_size - INDEX_BLOCK_HEADER_SIZE);
- }
- }
-
- static void make_dummy_block _AP((FILE* from_stream_1,FILE* from_stream_2,
- FILE* to_stream));
-
- static void make_dummy_block(from_stream_1,from_stream_2,to_stream)
- FILE *from_stream_1;
- FILE *from_stream_2;
- FILE *to_stream;
- /* deletes block from_stream_1 and from_stream_2 and makes a 0 length
- block on to_stream */
- {
- long flag;
- long index_block_size;
- long other_block_size;
-
- flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_1);
- (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_1);
- index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_1);
-
- if(flag == EOF) return;
- if(flag != INDEX_BLOCK_FULL_FLAG)
- panic("the next index block is not a full block");
-
- flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_2);
- (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_2);
- other_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_2);
- if(flag != INDEX_BLOCK_FULL_FLAG)
- panic("the next index block is not a full block");
-
- write_bytes(flag, INDEX_BLOCK_FLAG_SIZE, to_stream);
- write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, to_stream);
- write_bytes(INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_SIZE_SIZE, to_stream);
- s_fseek(from_stream_1, index_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR);
- s_fseek(from_stream_2, other_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR);
- }
-
- static void copy_block _AP((FILE* from_stream,FILE* to_stream));
-
- static void copy_block(from_stream,to_stream)
- FILE *from_stream;
- FILE *to_stream;
- /* copies an index block from one stream to another */
- { /* copies an index block from from_stream to to_stream */
- long flag;
- long next_index_block;
- long index_block_size;
-
- flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream);
- next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream);
- index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream);
-
- if(flag == EOF) return;
- write_bytes(flag, INDEX_BLOCK_FLAG_SIZE, to_stream);
- write_bytes(next_index_block, NEXT_INDEX_BLOCK_SIZE, to_stream);
- write_bytes(index_block_size, INDEX_BLOCK_SIZE_SIZE, to_stream);
- /* copy the real stuff */
- copy_stream(from_stream, to_stream,
- index_block_size - INDEX_BLOCK_HEADER_SIZE);
- }
-
- /* ================== */
- /* === Merge File === */
- /* ================== */
-
- static boolean merge_index_file _AP((long from_version, long into_version,
- database* db,
- boolean generate_dictionary));
-
- /* Merges a index file (from_version) into another (into_version).
- Returns true if successful */
- static boolean merge_index_file(into_version, from_version,
- db,generate_dictionary)
- long into_version;
- long from_version;
- database* db;
- boolean generate_dictionary;
- /*
- This is done by merging both into version -1 and then,
- renames it to into_version, then deletes from_version. */
- {
- char input_filename_1[MAX_FILENAME_LEN]; /* into_version file */
- char input_filename_2[MAX_FILENAME_LEN]; /* from_version file */
- char output_filename[MAX_FILENAME_LEN]; /* version -1 */
-
- FILE *input_stream_1;
- FILE *input_stream_2;
- FILE *output_stream;
- char input_word_1[MAX_WORD_LENGTH + 1];
- char input_word_2[MAX_WORD_LENGTH + 1];
- long number_of_words_1;
- long number_of_words_2;
-
- if(generate_dictionary) {
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Merging version %ld into %ld generating dictionary.",
- from_version, into_version);
- }
- else {
- waislog(WLOG_MEDIUM, WLOG_INDEX, "Merging version %ld and %ld",
- into_version, from_version);
- }
-
- index_filename_with_version(into_version, input_filename_1, db);
- index_filename_with_version(from_version, input_filename_2, db);
- index_filename_with_version(-1L, output_filename, db);
-
-
- if(NULL == (input_stream_1 = s_fopen(input_filename_1, "rb")))
- return(false);
-
- if(NULL == (input_stream_2 = s_fopen(input_filename_2, "rb")))
- return(false);
-
- if(NULL == (output_stream = s_fopen(output_filename, "wb")))
- return(false);
-
- {
- long index_block_size_1;
- long number_of_occurances_1;
- long index_block_size_2;
- long number_of_occurances_2;
- number_of_words_1 = read_bytes(INDEX_HEADER_SIZE, input_stream_1);
- number_of_words_2 = read_bytes(INDEX_HEADER_SIZE, input_stream_2);
- /* printf("Number of words 1: %ld, 2: %ld\n", number_of_words_1, number_of_words_2); */
- db->number_of_words =
- ((EOF != number_of_words_1) ? number_of_words_1 : 0) +
- ((EOF != number_of_words_2) ? number_of_words_2 : 0);
-
- write_bytes(db->number_of_words, INDEX_HEADER_SIZE, output_stream);
- if(generate_dictionary)
- init_dict_file_for_writing(db);
-
- next_dictionary_index_block(&index_block_size_1,
- &number_of_occurances_1,
- input_word_1,
- input_stream_1);
- next_dictionary_index_block(&index_block_size_2,
- &number_of_occurances_2,
- input_word_2,
- input_stream_2);
- while(true){
- long strlen_1 = strlen(input_word_1);
- long strlen_2 = strlen(input_word_2);
- long compare = strcmp(input_word_1, input_word_2); /* +1 if 1 is bigger */
-
- if((0 == strlen_1) &&
- (0 == strlen_2)){
- /* printf("Done with merge\n"); */
- /* then we are done. */
- break;
- }
- else if((0 != strlen_1) && (0 != strlen_2) && (0 == compare)){
- /* they are equal */
- /* printf("Merging word %s and %s\n", input_word_1, input_word_2); */
- write_dictionary_index_block(number_of_occurances_1 +
- number_of_occurances_2,
- input_word_1,
- output_stream);
- if(generate_dictionary)
- add_word_to_dictionary(input_word_1, ftell(output_stream),
- number_of_occurances_1 + number_of_occurances_2,
- db);
- /* copy file 1 first. this assumes that file 1 was indexed before
- file 2 */
- if((number_of_occurances_1+number_of_occurances_2) > MAX_OCCURANCES){
- /* too many already, just make a dummy (0 length) */
- if((number_of_occurances_1 < MAX_OCCURANCES) &&
- (number_of_occurances_2 < MAX_OCCURANCES)) {
- /* only print the first time */
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Deleting word '%s' since it has %ld occurances (limit %ld).",
- input_word_1, number_of_occurances_1+number_of_occurances_2,
- MAX_OCCURANCES);
- }
- make_dummy_block(input_stream_1, input_stream_2, output_stream);
- }
- else{
- merge_blocks(input_word_1,input_stream_1,input_stream_2,
- output_stream);
- }
- next_dictionary_index_block(&index_block_size_1,
- &number_of_occurances_1,
- input_word_1,
- input_stream_1);
- next_dictionary_index_block(&index_block_size_2,
- &number_of_occurances_2,
- input_word_2,
- input_stream_2);
- }
- else if(((0 == strlen_1) && (0 != strlen_2) && (0 > compare)) ||
- ((0 != strlen_1) && (0 != strlen_2) && (0 < compare))){
- /* write from block 2 */
- /* printf("From block 2: writing word '%s' not '%s'\n",
- input_word_2, input_word_1); */
- write_dictionary_index_block(number_of_occurances_2, input_word_2,
- output_stream);
- if(generate_dictionary)
- add_word_to_dictionary(input_word_2, ftell(output_stream),
- number_of_occurances_2, db);
- copy_block(input_stream_2, output_stream);
- next_dictionary_index_block(&index_block_size_2,
- &number_of_occurances_2,
- input_word_2,
- input_stream_2);
- }
- else{
- /* write from block 1 */
- /* printf("From block 1: writing word '%s' not '%s'\n",
- input_word_1, input_word_2); */
- write_dictionary_index_block(number_of_occurances_1, input_word_1,
- output_stream);
- if(generate_dictionary)
- add_word_to_dictionary(input_word_1, ftell(output_stream),
- number_of_occurances_1, db);
- copy_block(input_stream_1, output_stream);
- next_dictionary_index_block(&index_block_size_1,
- &number_of_occurances_1,
- input_word_1,
- input_stream_1);
- }
- }
- }
- s_fclose(input_stream_1);
- s_fclose(input_stream_2);
- s_fclose(output_stream);
- /* check resulting file */
- /* check_index_file(output_filename); */
- /* scan_index_blocks(output_filename); */
-
- /* delete the input files */
- remove(input_filename_1);
- remove(input_filename_2);
- rename(output_filename, input_filename_1);
-
- #ifdef THINK_C
- /* set the mac file type to INDX */
- setFileType(input_filename_1, WAIS_INDEX_FILE_TYPE, CREATOR);
- #endif /* THINK_C */
- return(true);
- }
-
- /* only works on positive, non-zero numbers, and is slow to boot, yippie */
- static long logcount _AP((long number));
static long logcount(number)
long number;
{
  /* Counts how many times number can be shifted right by one bit before it
     is no longer positive; for a positive number that is the position of
     its highest set bit (1 -> 1, 2..3 -> 2, 4..7 -> 3, ...).
     Returns 0 for zero and for negative numbers. */
  long shifts = 0;

  while(number > 0){
    shifts++;
    number = number >> 1;
  }
  return(shifts);
}
-
-
- static void merge_index_files _AP((database* db));
-
- static void merge_index_files(db)
- database *db;
- /* this merges all the temporary inverted files into a large on
- * and creates a dictionary.
- * This is done in a logrithmic merge of the files.
- * An n-ary merge would be faster of course, but what the heck...
- */
- {
- /* this version does it two at a time */
- char filename[MAX_FILENAME_LEN];
- char filename2[MAX_FILENAME_LEN];
- long level;
- long final_level;
- long number_of_files_to_merge = db->index_file_number;
- /* at level 0, merge 0,1->0; 2,3->2; 4,5->4; 6,7->6;
- at level 1, merge 0,2->0; 4,6->4;
- at level 2, merge 0,4->0;
- stop.
- If there is only 1 file, then dont merge (final_level will be -1),
- the dictionary will be generated by merge after the for loop.
- */
- final_level = logcount(number_of_files_to_merge - 1) - 1;
- for(level = 0; level <= final_level; level++){
- long into_version;
- for(into_version = 0;
- into_version < number_of_files_to_merge;
- into_version += (2 << level)){
- long from_version = into_version + (1 << level);
- if(from_version < number_of_files_to_merge){
- boolean generate_dictionary; /* if this is the final level and
- there is no .inv file then yes */
- if(level == final_level){
- if(probe_file(index_filename(filename, db)))
- generate_dictionary = false;
- else generate_dictionary = true;
- }
- else{
- generate_dictionary = false;
- }
-
- /* printf("Level %d (out of %d) merging into %d from %d\n",
- level, final_level, into_version, from_version); */
- if(false == merge_index_file(into_version, from_version, db,
- generate_dictionary))
- panic("Error in merging file %ld into %ld",
- from_version, into_version);
- }
- }
- } /* done merging */
- /* if there is a .inv file, then we are adding to an existing db. merge */
- if(probe_file(index_filename(filename, db))){
- /* printf("Merging into existing db\n"); */
- /* rename the .inv file to 1, then merge it into 0 */
- rename(index_filename(filename, db),
- index_filename_with_version(1L, filename2, db));
- merge_index_file(0L, 1L, db, true);
- }
- else if(number_of_files_to_merge == 1){
- /* then we have to generate a dictionary for this one file.
- create a dummy file in 1, then merge that while making a dictionary
- */
- touch_file(index_filename_with_version(1, filename, db));
- merge_index_file(0L, 1, db, true);
- }
- /* rename 0 into the final name */
- rename(index_filename_with_version(0L,filename2, db),
- index_filename(filename, db));
- }
-
- /* ============================================ */
- /* === Flushing the memory version of the === */
- /* === word hashtable to disk files === */
- /* ============================================ */
-
- static void flush_word_entry_to_file _AP((word_entry* wrd_entry,
- database* db));
-
- static void flush_word_entry_to_file(wrd_entry,db)
- word_entry* wrd_entry;
- database* db;
- {
- /* In this version, each word_entry is made into an index block.
- This means that there may be many small index blocks if the memory
- can not hold many files worth of data. This approach was taken
- for simplicities sake and it might be the best approach if there
- is a small vocabulary.
-
- This assumes that the index stream is positioned at the end.
- */
-
- if(wrd_entry->number_of_occurances > MAX_OCCURANCES){
- /* there are too many of this word, do nothing.
- * this may result in some having been written before, or
- * no occurances of this word might be recorded. Is this right?
- * if the first MAX_OCCURANCES want to be recored, then
- * add this clause to this condition:
- * && wrd_entry->starting_block_number != 0
- */
- return;
- }
- if((wrd_entry->current_memory_ptr - wrd_entry->memory_ptr) >
- (1L << (8 * INDEX_BLOCK_SIZE_SIZE)))
- return; /* there are too many entries to store away, therefore throw it all away */
-
- if(NULL == wrd_entry->memory_ptr){
- /* not entries to write out */
- return;
- }
- if(0 == (wrd_entry->current_memory_ptr - wrd_entry->memory_ptr)){
- return; /* there are no word entries to write */
- }
- /* printf("Flushing word %s\n", wrd_entry->word); */
- /* if this is the entry for this word, the put in the dictionary
- entry in the index file */
- write_dictionary_index_block(wrd_entry->number_of_occurances,
- wrd_entry->word,
- db->index_stream);
-
- db->number_of_words++;
-
- /* allocate and write the new block */
- allocate_index_block(wrd_entry->current_memory_ptr -
- wrd_entry->memory_ptr +
- INDEX_BLOCK_HEADER_SIZE,
- db->index_stream);
- /* cprintf(PRINT_AS_INDEXING, "New block number: %ld\n", new_block); */
- if((wrd_entry->current_memory_ptr - wrd_entry->memory_ptr) !=
- fwrite(wrd_entry->memory_ptr, 1L, (wrd_entry->current_memory_ptr -
- wrd_entry->memory_ptr),
- db->index_stream))
- panic("Write failed");
- /* free the memory for the block written out */
- free_word_occurance_block(wrd_entry->memory_ptr);
- wrd_entry->memory_ptr = NULL;
- wrd_entry->current_memory_ptr = NULL;
- wrd_entry->memory_size = 0;
- wrd_entry->current_doc_id = 0;
- }
-
-
- /* This flushes all of the memory version of the word hashtable to disk.
- * this is called when the hashtable is filling up or we have
- * accumulated enough words. If completely is true, then it will merge
- * all the intermediate files.
- */
- void flush_memory_hashtable_to_disk(db,completely)
- database* db;
- boolean completely;
- {
- /* map over the memory word hashtable and write the entries to disk */
- long i;
- char filename[1000];
-
- db->index_stream =
- s_fopen(index_filename_with_version(db->index_file_number++, filename, db),
- "wb");
- if(NULL == db->index_stream)
- panic("Could not open file %s to insert index",
- index_filename_with_version(db->index_file_number, filename, db));
-
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Flushing %ld different words, %ld total words to disk...",
- db->the_word_memory_hashtable->number_of_entries,
- db->the_word_memory_hashtable->number_of_words_indexed);
-
- db->number_of_words = 0;
-
- write_bytes(0l, INDEX_HEADER_SIZE, db->index_stream);
- collapse_word_memory_hashtable(db->the_word_memory_hashtable); /* compact the entries to the bottom */
- sort_word_memory_hashtable(db->the_word_memory_hashtable); /* sort the entries */
- for(i = 0; i < db->the_word_memory_hashtable->number_of_entries; i++){
- if(NULL != db->the_word_memory_hashtable->contents[i]){
- /* then we have an entry to write */
- word_entry* the_word_entry = db->the_word_memory_hashtable->contents[i];
- flush_word_entry_to_file(the_word_entry, db);
- }
- }
-
- /* write out the number of entries into the index_file header */
- s_fseek(db->index_stream, 0L, SEEK_SET);
- write_bytes(db->number_of_words, INDEX_HEADER_SIZE, db->index_stream);
- s_fclose(db->index_stream);
-
- /* since everything is written out, we can flush these. */
- flush_word_occurance_buffers();
- clear_word_memory_hashtable(db->the_word_memory_hashtable);
- /* add the stopwords to the index for the next session. */
- add_stop_words(db->the_word_memory_hashtable);
-
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Done flushing");
- /* scan_index_blocks(filename); */
-
- if(completely){
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Merging %ld files",
- db->index_file_number);
- merge_index_files(db);
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Done merging.");
- }
- }
-
- long finished_add_word(db)
- database *db;
- {
- flush_memory_hashtable_to_disk(db,true);
- return(0); /* successful */
- }
-
- long init_add_word(db, hashtable_size, flush_after_n_words)
- database *db;
- long hashtable_size;
- long flush_after_n_words;
- {
- db->the_word_memory_hashtable =
- init_word_memory_hashtable(hashtable_size,
- flush_after_n_words,
- db->the_word_memory_hashtable);
- return(0);
- }
-